

clear all
set more off
cap log close

***************************************************************************************************
* Program: process_lodes_data.do
* Purpose: 
/*
* LODES website: https://lehd.ces.census.gov/data/
* Data Dictionary: https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.1.pdf
*
* Steps:
* 1. Identify the Census Block Groups that contain California state prisons.
* 2. Construct a prison-ZCTA contact matrix using LODES data by aggregating:
*    home block group -> home census tract -> home ZCTA.
*
* Note: 7 block groups containing prisons are not included in the LODES dataset.
*/

***************************************************************************************************

************************************************************************
** set global macros
************************************************************************
global prison_dir "~/Dropbox/Prison_Covid/PNAS_Nexus_Replication"
global raw_data "${prison_dir}/raw_data"
global lodes_dir "${raw_data}/LODES_data"
global intermediate_data "${prison_dir}/intermediate_data"

***************************************************************************************
* 1. Obtain block group of CA state prison 
// match prison to block group based on lat-long of prison boundary
****************************************************************************************

use "${raw_data}/prison_boundaries/Prison_Boundaries_CA_coor.dta", clear 
rename _ID shapefile_ID 
drop if _X == .
// match prison lat long lat-long to census block group
geoinpoly _Y _X using "${raw_data}/bg_boundaries/tl_2019_06_bg_coor.dta"
// merge to get census block group FIPS code 
merge m:1 _ID using "${raw_data}/bg_boundaries/tl_2019_06_bg_data.dta", keepusing(GEOID) ///
	keep(match) nogenerate 
drop _ID 

// keep unique shapefile_ID GEOID 
sort shapefile_ID GEOID 
drop if shapefile_ID == shapefile_ID[_n-1] & GEOID == GEOID[_n-1] 

// match to get prison name 
rename shapefile_ID _ID 
merge m:1 _ID using "${intermediate_data}/cdcr_population_w_ShapefileID.dta", ///
	keepusing(NAME ANALYTICSAMPLE) keep(match) nogenerate 

// drop non-state prison 
drop if inlist(NAME, "CUESTA CONSERVATION CAMP #24", "DEPARTMENT OF STATE HOSPITALS - ATASCADERO", ///
				"DEPARTMENT OF STATE HOSPITALS - COALINGA", "MCFARLAND FEMALE COMMUNITY REENTRY FACILITY", ///
				"FOLSOM WOMEN'S FACILITY (FWF)")

// calculate the number of BG that a prison is matched to 
gunique GEOID, by(NAME) generate(num_bg_in_prison)		
				
// browse if num_bg_in_prison == 2
// there are 3 prisons that are matched to two BGs and manual check which one (or both) is correct: 
/*
CORRECTIONAL TRAINING FACILITY (CTF) -> should be 060530109001 (the other block group just crossed a bit of the boundary line)
SALINAS VALLEY STATE PRISON (SVSP) -> 060530109001 (Note: CTF and SVSP shared the same block group)
SAN QUENTIN STATE PRISON (SQ) -> mostly in 060411220001;
// but a bit cross 060411212002 (however this block group includes too much non-prison buildings so will drop this BG)
*/

// drop "incorrectly" matched block groups 
drop if NAME == "CORRECTIONAL TRAINING FACILITY (CTF)" & GEOID != "060530109001"
drop if NAME == "SALINAS VALLEY STATE PRISON (SVSP)" & GEOID != "060530109001"
drop if NAME == "SAN QUENTIN STATE PRISON (SQ)" & GEOID != "060411220001"

rename GEOID gidbg 
destring gidbg, replace 

// calculate the number of prison that a BG is matched to 
gunique NAME, by(gidbg) generate(num_prison_in_bg)
// 7 block groups are matched to 2 prisons 

drop num_bg_in_prison

save "${intermediate_data}/CA_prison_location_matchedbg.dta", replace 


/*
LIST of block group that have 2 prisons
gidbg	NAME
60310016021	CALIFORNIA STATE PRISON, CORCORAN (COR)
60310016021	CA SUBSTANCE ABUSE TREATMENT FACILITY (SATF)
60350404002	CALIFORNIA CORRECTIONAL CENTER (CCC)
60350404002	HIGH DESERT STATE PRISON (HDSP)
60390002011	CENTRAL CALIFORNIA WOMEN'S FACILITY (CCFW)
60390002011	VALLEY STATE PRISON (VSP)
60530109001	CORRECTIONAL TRAINING FACILITY (CTF)
60530109001	SALINAS VALLEY STATE PRISON (SVSP)
60659810001	IRONWOOD STATE PRISON (ISP)
60659810001	CHUCKAWALLA VALLEY STATE PRISON (CVSP)
60679883001	CALIFORNIA STATE PRISON, SACRAMENTO (SAC)
60679883001	FOLSOM STATE PRISON (FSP)
60952530001	CALIFORNIA MEDICAL FACILITY (CMF)
60952530001	CALIFORNIA STATE PRISON, SOLANO (SOL)
*/

* create a unique list of BG containing prisons for later merge 
use "${intermediate_data}/CA_prison_location_matchedbg.dta", clear 
sort gidbg 
drop if gidbg == gidbg[_n-1]
keep gidbg num_prison_in_bg
save "${intermediate_data}/uniq_bg_list_w_prisons.dta", replace 


****************************************************************************************
* 2.1 Obtain a contact matrix between block group (prison) - block group (home)
****************************************************************************************

// import LODES data // JT01 means primary job 
import delimited "${lodes_dir}/ca_od_main_JT01_2020.csv", clear

// gen double w_tract = floor(w_geocode/10000)
gen double w_bg = floor(w_geocode/1000)
gen double h_tract = floor(h_geocode/10000)
gen double h_bg = floor(h_geocode/1000)

format w_* h_*  %16.0g

// merge to only keep work block groups that contain prison 
rename w_bg gidbg 
merge m:1 gidbg using "${intermediate_data}/uniq_bg_list_w_prisons.dta"
rename gidbg w_bg 
keep if _merge == 3
drop _merge 

/*
Note: there are 7 block groups that contain prison are NOT in the LODES data 

	Result                      Number of obs
	-----------------------------------------
	Not matched                    14,292,936
		from master                14,292,929  (_merge==1)
		from using                          7  (_merge==2)

	Matched                            32,539  (_merge==3)
	-----------------------------------------

list of bg with _merge == 2
w_bg
60190079011
60310016021
60710019032
60710122001
60730100141
60770055011
60790114001

// note: I tried other versions of LODES data, like ca_od_main_JT01_2020.csv, ca_od_main_JT00_2020.csv
// it's always these 7 BGs that are not matched to the LDOES observations 
*/ 

// calculate the total contact (i.e. jobs) between home block groups and work block groups 
// s000: Total number of jobs
// si03: Number of jobs in All Other Services industry sectors
gegen wbg_hbg_s000 = total(s000), by(w_bg h_bg)
gegen wbg_hbg_si03 = total(si03), by(w_bg h_bg)


// collapse to unique w_bg h_bg pair 
sort w_bg h_bg 
drop if w_bg == w_bg[_n-1] & h_bg == h_bg[_n-1]

// summarize this measure 
sum wbg_hbg_s000 wbg_hbg_si03

keep w_bg h_bg wbg_hbg_* h_tract num_prison_in_bg

save "${intermediate_data}/LODES_od_main_JT01_2020_matrix_prisonBG_to_homeBG.dta", replace 

****************************************************************************************
* 2.2 create work block group to home tract matrix 
// that's because there is only crosswalk between home tract and ZCTA 
****************************************************************************************

use "${intermediate_data}/LODES_od_main_JT01_2020_matrix_prisonBG_to_homeBG.dta", clear 

gegen wbg_htract_s000 = total(wbg_hbg_s000), by(w_bg h_tract)
gegen wbg_htract_si03 = total(wbg_hbg_si03), by(w_bg h_tract)
	
// collapse to unique w_bg h_tract pair 
sort w_bg h_tract
drop if w_bg == w_bg[_n-1] & h_tract == h_tract[_n-1]

// summarize contact (i.e. total jobs)
sum wbg_htract*

keep w_bg wbg_htract* h_tract 

save "${intermediate_data}/LODES_od_main_JT01_2020_matrix_prisonBG_to_homeTract.dta", replace 


****************************************************************************************
* 2.3 Obtain ZCTA-Tract Crosswalk
****************************************************************************************

* import ZCTA-tract crosswalk from census 
import delimited "${lodes_dir}/zcta_tract_rel_10.txt", clear 

/* CODEBOOK: 

POPPT	14	Calculated 2010 Population for the relationship record
HUPT	14	Calculated 2010 Housing Unit Count for the relationship record
AREAPT	14	Total Area for the record
AREALANDPT	14	Land Area for the record
ZPOP	14	2010 Population of the 2010 ZCTA
TRPOP	14	2010 Population of the 2010 Census Tract
ZPOPPCT	5	The Percentage of Total Population of the 2010 ZCTA represented by the record
TRPOPPCT	5	The Percentage of Total Population of the 2010 Census Tract represented by the record

*/
rename geoid FIPS_tract 

// since 1 tract can be matched to multiple ZCTAs 
// thus, will assign jobs between BG-ZCTAs by weighting the share of tract population in the ZCTA (TRPOPPCT)
keep FIPS_tract zcta5 trpoppct

// rescale trpoppct to 0-1
replace trpoppct = trpoppct / 100

save "${lodes_dir}/zcta_tract_rel_10.dta", replace 

****************************************************************************************
* 2.3 Obtain San Quentin prison - home ZCTA contact matrix w/ LODES
****************************************************************************************

// read wBG-hTract data 
use "${intermediate_data}/LODES_od_main_JT01_2020_matrix_prisonBG_to_homeTract.dta", clear 

// get all pairwise combination of tract-ZCTA 
rename h_tract FIPS_tract 
joinby FIPS_tract using "${lodes_dir}/zcta_tract_rel_10.dta" 

// calculate tha amount of tract contact that are assigned to a ZCTA 
// by weighting the share of tract population in the ZCTA (TRPOPPCT)
gen zcta_tract_s000 = wbg_htract_s000 * trpoppct
gen zcta_tract_si03 = wbg_htract_si03 * trpoppct

// calculate total contact between block group and home ZCTA 
gegen zcta_s000 = total(zcta_tract_s000), by(w_bg zcta5)
gegen zcta_si03 = total(zcta_tract_si03), by(w_bg zcta5)

// collapse to BG-ZCTA level 
sort w_bg zcta5
drop if w_bg == w_bg[_n-1] & zcta5 == zcta5[_n-1]

rename zcta5 h_zcta 

keep w_bg h_zcta zcta_s000 zcta_si03

// keep the BG that contain San Quentin SP
keep if w_bg == 060411220001

// Note: in that BG, all jobs are in other industry (which makes sense for prison)
assert zcta_s000 == zcta_si03 if w_bg == 060411220001
// Thus drop zcta_si03 
drop zcta_si03 

// fill in zero 
rename h_zcta ZCTA 
merge 1:1 ZCTA using "${intermediate_data}/CA_ZCTA_uniq_list.dta"
drop if _merge == 1
replace zcta_s000 = 0 if _merge == 2
drop _merge w_bg // because w_bg == . if _merge == 2

gen NAME = "SAN QUENTIN STATE PRISON (SQ)"

save "${intermediate_data}/LODES_2020_SQSPBG_to_homeZCTA.dta", replace


****************************************************************************************
* 2.4 - Get total number of jobs by prison BG
****************************************************************************************

import delimited "${lodes_dir}/ca_od_main_JT01_2020.csv", clear

// gen double w_tract = floor(w_geocode/10000)
gen double w_bg = floor(w_geocode/1000)
gen double h_tract = floor(h_geocode/10000)
gen double h_bg = floor(h_geocode/1000)

format w_* h_*  %16.0g

// merge to only keep work block groups that contain prison 
rename w_bg gidbg 
merge m:1 gidbg using "${intermediate_data}/uniq_bg_list_w_prisons.dta"
rename gidbg w_bg 
keep if _merge == 3
drop _merge 
// Note: there are 7 block groups that contain prison are NOT in the LODES data 

gegen wbg_s000 = total(s000), by(w_bg)
gegen wbg_si03 = total(si03), by(w_bg)

// collapse to unique w_bg h_bg pair 
sort w_bg
drop if w_bg == w_bg[_n-1]

// summarize this measure 
sum wbg_s000 wbg_si03

keep w_bg wbg_* num_prison_in_bg

save "${intermediate_data}/LODES_tot_jobs_JT01_2020_byprisonBG.dta", replace 

